{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 25,
   "id": "12bae709-5500-4f75-aa92-bebdba616e6e",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "from bertopic import BERTopic\n",
    "from sklearn.model_selection import train_test_split\n",
    "from sklearn.metrics import classification_report"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "91775253-184d-4ba4-9df2-b41f403e3f25",
   "metadata": {},
   "outputs": [],
   "source": [
    "%run -i \"../util/lang_utils.ipynb\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "511be447-3523-420e-8973-6d6a0af1d112",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "           category                                               text\n",
      "0              tech  tv future in the hands of viewers with home th...\n",
      "1          business  worldcom boss  left books alone  former worldc...\n",
      "2             sport  tigers wary of farrell  gamble  leicester say ...\n",
      "3             sport  yeading face newcastle in fa cup premiership s...\n",
      "4     entertainment  ocean s twelve raids box office ocean s twelve...\n",
      "...             ...                                                ...\n",
      "2220       business  cars pull down us retail figures us retail sal...\n",
      "2221       politics  kilroy unveils immigration policy ex-chatshow ...\n",
      "2222  entertainment  rem announce new glasgow concert us band rem h...\n",
      "2223       politics  how political squabbles snowball it s become c...\n",
      "2224          sport  souness delight at euro progress boss graeme s...\n",
      "\n",
      "[2225 rows x 2 columns]\n"
     ]
    }
   ],
   "source": [
    "stop_words = stopwords.words('english')\n",
    "stop_words.append(\"said\")\n",
    "stop_words.append(\"mr\")\n",
    "bbc_df = pd.read_csv(\"../data/bbc-text.csv\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "f9fa0ede-5a4b-4d6e-9370-a9091f9a701f",
   "metadata": {},
   "outputs": [],
   "source": [
    "bbc_df[\"text\"] = bbc_df[\"text\"].apply(lambda x: word_tokenize(x))\n",
    "bbc_df[\"text\"] = bbc_df[\"text\"].apply(lambda x: [w for w in x if w not in stop_words])\n",
    "bbc_df[\"text\"] = bbc_df[\"text\"].apply(lambda x: \" \".join(x))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "fcd971d9-24a1-4ce1-987c-d46452aa222f",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "2002\n",
      "223\n"
     ]
    }
   ],
   "source": [
    "bbc_train, bbc_test = train_test_split(bbc_df, test_size=0.1)\n",
    "print(len(bbc_train))\n",
    "print(len(bbc_test))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "034d5c50-7313-4f37-9c97-038c63103e42",
   "metadata": {},
   "outputs": [],
   "source": [
    "docs = bbc_train[\"text\"].values"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "024b1c8c-7938-4a8a-895c-62aefc5290d2",
   "metadata": {},
   "outputs": [],
   "source": [
    "topic_model = BERTopic(nr_topics=6)\n",
    "topics, probs = topic_model.fit_transform(docs)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "1eb713ac-eae0-4c1e-bc5a-bd8e15153925",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "   Topic  Count                                 Name  \\\n",
      "0     -1    222             -1_also_company_china_us   \n",
      "1      0    463             0_england_game_win_first   \n",
      "2      1    393      1_would_labour_government_blair   \n",
      "3      2    321             2_film_best_music_awards   \n",
      "4      3    309  3_people_mobile_technology_software   \n",
      "5      4    294             4_us_year_growth_economy   \n",
      "\n",
      "                                      Representation  \\\n",
      "0  [also, company, china, us, would, year, new, p...   \n",
      "1  [england, game, win, first, club, world, playe...   \n",
      "2  [would, labour, government, blair, election, p...   \n",
      "3  [film, best, music, awards, show, year, band, ...   \n",
      "4  [people, mobile, technology, software, digital...   \n",
      "5  [us, year, growth, economy, economic, company,...   \n",
      "\n",
      "                                 Representative_Docs  \n",
      "0  [us retail sales surge december us retail sale...  \n",
      "1  [ireland win eclipses refereeing errors intern...  \n",
      "2  [lib dems unveil election slogan liberal democ...  \n",
      "3  [scissor sisters triumph brits us band scissor...  \n",
      "4  [mobiles media players yet mobiles yet ready a...  \n",
      "5  [consumer spending lifts us growth us economic...  \n"
     ]
    }
   ],
   "source": [
    "print(topic_model.get_topic_info())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "ab8c0892-1a4c-48a9-aa0b-f466d7df3c97",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[('england', 0.023923609275000306), ('game', 0.023874910540888444), ('win', 0.02089139078895572), ('first', 0.019051267767033135), ('club', 0.017470682428724963), ('world', 0.017352529044283988), ('players', 0.016899062892940703), ('cup', 0.016893627391621816), ('last', 0.01665891773908297), ('two', 0.016416826185772164)]\n"
     ]
    }
   ],
   "source": [
    "print(topic_model.get_topic(0))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "cbfab82e-1453-4359-ab33-323f10d32bfb",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[('would', 0.035433209961128954), ('labour', 0.03291644460816451), ('government', 0.02936700096809325), ('blair', 0.02702878364855685), ('election', 0.026944432307687366), ('party', 0.02566497689534048), ('people', 0.023100026902541076), ('brown', 0.02021005312555692), ('minister', 0.020034108662757368), ('also', 0.01606611985845893)]\n"
     ]
    }
   ],
   "source": [
    "print(topic_model.get_topic(1))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "b9e239ec-e501-49c6-bb5f-fe630b9c59f3",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[('film', 0.04996542271015715), ('best', 0.03954338090393644), ('music', 0.02608204595923062), ('awards', 0.022660883747149464), ('show', 0.019911582970118826), ('year', 0.019569114192597062), ('band', 0.01947269693504342), ('also', 0.01936668159798611), ('award', 0.019271695654981706), ('one', 0.018568435901997235)]\n"
     ]
    }
   ],
   "source": [
    "print(topic_model.get_topic(2))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "8cb54778-0fcd-409a-804a-864bbe5ea699",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[('people', 0.029907130205424383), ('mobile', 0.024988214218395956), ('technology', 0.02246648997447284), ('software', 0.01907466920671406), ('digital', 0.01846667229083264), ('music', 0.018089961948297602), ('users', 0.01779304168540416), ('one', 0.017635710770530333), ('also', 0.017408837514157897), ('new', 0.01738824388413855)]\n"
     ]
    }
   ],
   "source": [
    "print(topic_model.get_topic(3))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "1ba5fbc4-ee8d-483a-894f-a981c5310dde",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[('us', 0.03584149650622307), ('year', 0.02528335130307409), ('growth', 0.024390583399937417), ('economy', 0.021929580756366116), ('economic', 0.019792138393691337), ('company', 0.019580005023360762), ('yukos', 0.018906513989042615), ('market', 0.01801654052491897), ('oil', 0.017884696942644544), ('firm', 0.01642415825482552)]\n"
     ]
    }
   ],
   "source": [
    "print(topic_model.get_topic(4))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "id": "7c4e79a4-4cd7-4a22-bbc7-49248abbd3ef",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['-1_also_company_china_us_would',\n",
       " '0_england_game_win_first_club',\n",
       " '1_would_labour_government_blair_election',\n",
       " '2_film_best_music_awards_show',\n",
       " '3_people_mobile_technology_software_digital',\n",
       " '4_us_year_growth_economy_economic']"
      ]
     },
     "execution_count": 31,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "topic_model.generate_topic_labels(nr_words=5, topic_prefix=True, separator='_')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "id": "8c55ed71-4c72-420a-8bdd-7950adcc77b8",
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_prediction(input_text, model):\n",
    "    pred = model.transform(input_text)\n",
    "    pred = pred[0][0]\n",
    "    return pred"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "id": "0d8f02bb-6f7a-4384-8234-b58e10507356",
   "metadata": {},
   "outputs": [],
   "source": [
    "bbc_test[\"prediction\"] = bbc_test[\"text\"].apply(lambda x: get_prediction(x, topic_model))\n",
    "topic_mapping = {0:\"sport\", 1:\"politics\", 2:\"entertainment\", 3:\"tech\", 4:\"business\", -1:\"discard\"}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "id": "cef0a892-b3cb-4ab1-b81c-474f168b9125",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "               precision    recall  f1-score   support\n",
      "\n",
      "     business       0.95      0.86      0.90        21\n",
      "entertainment       0.97      1.00      0.98        30\n",
      "     politics       0.94      1.00      0.97        46\n",
      "        sport       1.00      1.00      1.00        62\n",
      "         tech       0.96      0.88      0.92        25\n",
      "\n",
      "     accuracy                           0.97       184\n",
      "    macro avg       0.96      0.95      0.95       184\n",
      " weighted avg       0.97      0.97      0.97       184\n",
      "\n"
     ]
    }
   ],
   "source": [
    "bbc_test[\"pred_category\"] = bbc_test[\"prediction\"].apply(lambda x: topic_mapping[x])\n",
    "test_data = bbc_test.loc[bbc_test['prediction'] != -1]\n",
    "print(classification_report(test_data[\"category\"], test_data[\"pred_category\"]))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "id": "32a19ff5-1b16-4610-9104-50030882ea6d",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "howard dismisses tory tax fears michael howard dismissed fears conservatives plans £4bn tax cuts modest . defended package saying plan tories first budget hoped able go . tories monday highlighted £35bn wasteful spending would stop allow tax cuts reduced borrowing spending key services . labour liberal democrats say party sums add claim would cut frontline services . tory tax plan follows complaints party mps howard shadow chancellor oliver letwin taken long unveil proposals . promised figure yet reveal taxes would targeted . tory backbencher edward leigh proposals step right direction told financial times : would come sooner much greater tax cuts . interviewed bbc radio 2 jeremy vine show howard : perfectly true attacked one side people think ought promising much much bigger tax cuts spending cuts . side people say able achieve tax cuts . think got right . howard voters faced clear choice next election waste tax labour tory value money lower taxes . added : would like able time sure able start got recognise limit one go first budget . got responsible . latest tory plans came campaigning election - widely expected may - gathered pace . liberal democrats launched pre-election platform leader charles kennedy saying party authentic opposition particularly iraq war council tax university tuition fees . lib dem treasury spokesman vince cable also branded tory plans fantasy economics . labour hit back tory proposals even publication election coordinator alan milburn accusing howard producing fraudulent prospectus . party tuesday challenged tories publish full report david james trouble-shooter asked identify possible savings . tories turn demanding tony blair spell taxes would raise wins election .\n"
     ]
    }
   ],
   "source": [
    "new_input = bbc_test[\"text\"].iloc[0]\n",
    "print(new_input)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "id": "02633bf4-2c05-4db8-b760-1104298fbc26",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "([1], array([1.]))\n"
     ]
    }
   ],
   "source": [
    "print(topic_model.transform(new_input))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "id": "bf9283dd-fa69-49b3-8d57-7c7c5b19d176",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[(0, 0.29033981040460977), (3, 0.049293092462828376), (-1, -0.0047265937178774895), (2, -0.02074380026102955), (4, -0.03699168959416969)]\n"
     ]
    }
   ],
   "source": [
    "topics, similarity = topic_model.find_topics(\"sports\", top_n=5)\n",
    "sim_topics = list(zip(topics, similarity))\n",
    "print(sim_topics)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "id": "7fc0a66d-8dea-4232-a014-4a6271f6097e",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[(4, 0.29003573983158404), (-1, 0.26259758927249205), (3, 0.15627005753581313), (1, 0.05491237184012845), (0, 0.010567363445904386)]\n"
     ]
    }
   ],
   "source": [
    "topics, similarity = topic_model.find_topics(\"business and economics\", top_n=5)\n",
    "sim_topics = list(zip(topics, similarity))\n",
    "print(sim_topics)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "id": "5e5e7a0d-3dce-4b5d-89bd-df5e82863f35",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[(3, 0.2540850599909866), (-1, 0.172097560474608), (2, 0.1367798346494483), (4, 0.10243553209139492), (1, 0.06954579004136925)]\n"
     ]
    }
   ],
   "source": [
    "input_text = \"\"\"YouTube removed a snippet of code that publicly disclosed whether a channel receives ad payouts, \n",
    "obscuring which creators benefit most from the platform.\"\"\"\n",
    "topics, similarity = topic_model.find_topics(input_text, top_n=5)\n",
    "sim_topics = list(zip(topics, similarity))\n",
    "print(sim_topics)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0c831229-e0df-47a2-a5c5-3112457091bf",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
